
## Generate race/ethnicity variables for inventors
## Zihao Li. 06/2024

# All race variables below are built from the `race80` column (probability >= 80).
# For other specifications, change the column and variable names accordingly.

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk

# Root directory of the PatentsView data; main() builds its input paths
# ('cleandata/...', 'temp/...') and its output path by appending to this string.
# NOTE(review): this name shadows the builtin `dir`; renaming it requires
# updating every use in main() at the same time.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'  

def _race_flags(race_lists: pd.Series) -> pd.DataFrame:
    """Build patent-level 0/1 race indicators from lists of inventor race labels.

    Args:
        race_lists: One list of race labels per patent (no missing entries).

    Returns:
        An integer DataFrame sharing ``race_lists``' index, with the columns
        allwhite_80, allasian_80, allblack_80, allhispanic_80, mixed_race80,
        existwhite_80, existasian_80, existminority_80 and undetermined_race80,
        in that order.
    """
    def contains(label: str) -> pd.Series:
        # True where the patent's list holds at least one inventor with `label`.
        return pd.Series([label in labels for labels in race_lists],
                         index=race_lists.index, dtype=bool)

    has_white = contains('nh_white')
    has_asian = contains('asian')
    has_hispanic = contains('hispanic')
    has_black = contains('nh_black')
    has_ambiguous = contains('ambiguous')

    # The "all X" flags are defined by exclusion: none of the *other* named
    # labels appears in the list.
    # NOTE(review): a list holding none of the five named labels (e.g. only
    # missing values) therefore satisfies all four "all X" flags at once --
    # confirm whether such lists can occur in the input.
    all_white = ~(has_asian | has_hispanic | has_black | has_ambiguous)
    all_asian = ~(has_white | has_hispanic | has_black | has_ambiguous)
    all_black = ~(has_white | has_hispanic | has_asian | has_ambiguous)
    all_hispanic = ~(has_white | has_asian | has_black | has_ambiguous)
    exist_minority = has_hispanic | has_black

    flags = pd.DataFrame(
        {
            'allwhite_80': all_white,
            'allasian_80': all_asian,
            'allblack_80': all_black,
            'allhispanic_80': all_hispanic,
            'mixed_race80': ~(all_white | all_asian | all_black | all_hispanic),
            'existwhite_80': has_white,
            'existasian_80': has_asian,
            'existminority_80': exist_minority,
            'undetermined_race80': ~all_white & ~all_asian & ~exist_minority,
        },
        index=race_lists.index,
    )
    return flags.astype(int)


def main() -> None:
    """Attach inventor race variables to the patent-level dataset and export it.

    Steps: collect the `race80` labels of each patent's inventors into a list,
    left-merge the lists onto the patent-level data, drop patents without a
    list, derive indicator and categorical race variables, drop patent_ids
    containing letters, convert patent_id to numeric, and write the result to
    'cleandata/g_patent_clean_final.csv'.

    Raises:
        pandas.errors.MergeError: if `patent_id` is not unique in either the
            patent-level data or the grouped race data.
        ValueError: if a remaining patent_id cannot be parsed as a number.
    """
    ## Load race data: g_inventor_gender_race_age.csv
    print('Loading race data...')
    race = parallel_read_csv(dir + 'cleandata/g_inventor_gender_race_age.csv', file_type='csv')
    # One row per patent, holding the list of its inventors' race80 labels.
    race_df = race.groupby('patent_id')['race80'].agg(list).reset_index()
    del race
    # Merge key is compared as text on both sides.
    race_df['patent_id'] = race_df['patent_id'].astype(str)
    print(f'Shape of race dataset (grouped by patent_id) is {race_df.shape}.')  # (8256193, 2)

    ## Loading patent-level data: g_patent_clean_final_g.csv
    print('Merging with patent-level data...')
    df = parallel_read_csv(dir + 'temp/g_patent_clean_final_g.csv')
    print(f'Shape of patent-level data is {df.shape}.')  # (7481690, 130)
    df['patent_id'] = df['patent_id'].astype(str)

    # validate= makes a non-one-to-one merge fail here with a clear error,
    # instead of continuing without the race80 column.
    df = df.merge(race_df, how='left', on='patent_id', validate='one_to_one')
    del race_df
    df = df.dropna(subset=['race80'])
    print(f'Shape of merged dataset (after dropping NAN) is {df.shape}.')  # (7481690, 131)

    # Generate race variables
    print('Generating race variables...')
    # Indicator variables (every remaining row has a race80 list, so no mask is needed)
    flags = _race_flags(df['race80'])
    for column in flags.columns:
        df[column] = flags[column]
    del flags

    # Categorical variable. Assigned in this order, so a row matching several
    # flags keeps the label assigned last.
    for flag, label in (
        ('allwhite_80', 'allwhite'),
        ('allasian_80', 'allasian'),
        ('allblack_80', 'allblack'),
        ('allhispanic_80', 'allhispanic'),
        ('mixed_race80', 'mixed'),
    ):
        df.loc[df[flag] == 1, 'racediscrete_80'] = label

    # Drop patent_id with letters, and convert patent_id back to numeric
    print('Dropping non-numeric patent_id and converting to numeric...')
    df = df[~df["patent_id"].str.contains("[a-zA-Z]")]
    df["patent_id"] = pd.to_numeric(df["patent_id"], errors="raise")
    print(f'Shape of final dataset is {df.shape}.')  # (7481690, 145)
    df = df.rename(columns={'race80': 'race80_list'})

    print('Exporting the final dataset...')
    df.to_csv(dir + 'cleandata/g_patent_clean_final.csv', index=False)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()